/*------------------------------------------------------------------------------------------------
*------------------------------------------------------------------------------------------------
This code produces the maps in Appendix C: 
Figure C1: Geographic representation of the Clarity sample relative to surveys of non-bank borrowers.
This figure presents the geographic distribution of non-bank loan borrowers in the Clarity sample as well as in the IRSFFA (national survey
of low-to-middle income online tax filers) and SEIC (national survey taken during the early stages of COVID-19) surveys. In Panel A., the
outcome variable is the percentage of the state’s adult population that is represented in the Clarity database. This share is calculated by taking the
total number of distinct Clarity borrowers in our inquiries sample for a given state-year, dividing that number by the corresponding state adult
population (gathered from the American Community Survey), then averaging that percentage across years. Finally, we multiply that percentage
by 24.30 (= 63,000,000 / 2,592,099) to account for the fact that we only see a 4.1% random sample of the full Clarity consumer set. In Panels B and
C, the outcome variable is the percentage of survey respondents who report recent use of a non-bank loan, by state. The legend depicts quartiles
of the outcome variable, such that states with the darkest shade are in the top quartile of the outcome variable across states.
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ */

clear

/*-------------------------------------------
Global paths to import the data.
-------------------------------------------*/

global cd ""  //Set the path to the root folder ending in RFSDelivery
global path_md "${cd}/Data/IRSFFA/Sample" // input HFS data
global path_clarity "${cd}/Data/Clarity" // clarity data
global ttype "csv"  // change to "tex" if you want tex instead of "csv" outputs

/*-------------------------------------------
Clean the HFS data (more for the descriptive results, not really relevant for the maps produced at the end of this code)
-------------------------------------------*/
*use "${path_md}/Cleaned/hfsdf_f_08292021.dta" , replace
use "${path_md}/irsffa_cleaned_sample.dta" , replace

*Generate some group variables
gen year = yearf
egen yw = group(year wave)
egen stateg = group(state_abbrev)
egen countyg = group(county)
egen county_wave = group(county wave)
egen cbsa_year = group(cbsa year)
egen cbsa_wave = group(cbsa wave)
egen state_year = group(stateg year)
egen state_wave = group(stateg wave)
egen health_insuranceg = group(health_insurance)
label var year "Year"
label var wave "Wave"
label var stateg "State"
label var state_year "State \(\times\) Year"
label var state_wave "State \(\times\) Wave"
label var cbsa_year "CBSA \(\times\) Year"
label var yw "Year-Wave"
gen sample=1
gen antiplasma = 1 - plas_any if (plas_any!=.)
label var plas_any "Plasma"
*Fix up some of the labels (for consistency)
gen children_gt0 = (children>0) if (children!=.)
gen children_gt1 = (children>1) if (children!=.)
gen children_gt2 = (children>1) if (children!=.)
label var children "Children"
label var children_gt1 "Children gt1"
label var children_gt2 "Children gt2"
label var children_gt0 "Children Any"
gen children_sq = children ^ 2 if (children!=.)
label var children_sq "Children^2"
gen age_sq = age ^ 2
label var age_sq "Age sq."
gen singleparent = 0 if (married!=.)&(children!=.)
replace singleparent = 1 if (married==0)&(children>0)
label var singleparent "Single Parent"
gen age_cat_d = 1*(age>23) + 1*(age>29) + 1*(age>43)
gen age_lt23 = 1*(age<23) if age!=.
gen age_23to28 = 1*((age>=23)&(age<=28)) if age!=.
gen age_29to42 = 1*((age>=29)&(age<=42)) if age!=.
gen age_gt42 = 1*(age>42) if age!=.
gen age_gt35 = 1*(age>35) if age!=.
label var age_gt35 "Age gt35"
gen age_lte45 = 1*(age<=45) if age!=.
label var age_lte45 "Age lte45"
gen age_lte40 = 1*(age<=40) if age!=.
label var age_lte40 "Age lte40"
gen age_lte35 = 1*(age<=35) if age!=.
label var age_lte35 "Age lte35"
gen age_lte30 = 1*(age<=30) if age!=.
label var age_lte30 "Age lte30"
gen age_lt35_EG = 1*(age<35) if age!=.
gen age_35to50_EG = 1*((age>=35)&(age<50)) if age!=.
gen age_50to65_EG = 1*((age>=50)&(age<65)) if age!=.
gen age_gte65_EG = 1*(age>65) if age!=.
gen hhinc_lt25k_c = hhinc_lt10k + hhinc_lt15k + hhinc_lt25k 
gen hhinc_lt50k_c = hhinc_lt25k_c + hhinc_lt35k + hhinc_lt50k
gen student_ft = 1*(student==2) if student!=.
gen student_pt = 1*(student==1) if student!=.
label var student_ft "Student FT"
label var student_pt "Student PT"
cap drop student_s
gen student_s = 1*(student>0) if student!=.
label var student_s "Student"
gen male = 1*(gender=="Male") if gender!=""
gen emp_ft = 1*(employ==0) if employ!=.
gen emp_pt = 1*(employ==3) if employ!=.
gen emp_ue = 1*(employ==4) if employ!=.
label var male "Male"
label var emp_ft "Employed FT"
label var emp_pt "Employed PT"
label var emp_ue "Unemployed"
gen plas_numcp = plas_numc if (plas_numc>0)&(plas_numc!=.)
label var plas_numcp "Plasma Num"
gen bach = 0 if (educs!="")
replace bach = 1 if (educs=="bach")|(educs=="grad")
label var bach "Bachelor's or higher"
gen cbsad = 1*(cbsa<99999) if cbsa!=.
cap drop plasma_ind50_500 
egen plasma_ind50_500 = rowtotal(plasma_ind50 plasma_ind100 plasma_ind250 plasma_ind500)
* indicator to identify survey waves and years of HFS and SEIC
gen s1819 = 1*(year<2021)&(year>2017)
gen s21 = 1*(year==2021)
gen s21w1 = 1*(year==2021)&(wave==1)
gen s21w2 = 1*(year==2021)&(wave==2)
gen s21w3 = 1*(year==2021)&(wave==3)
gen s21w4 = 1*(year==2021)&(wave==4)
gen s21w5 = 1*(year==2021)&(wave==5)
* More cleaning (none of this is used in this code)
*Adjust the deciles to something easier to display.
replace fin_income_q10 = fin_income_q10 +1 
replace pop2sqkm_wq10 = pop2sqkm_wq10 +1
gen s1819w1 = 1*(year<2021)&(wave==1)
gen s1819a30 = 1*(year<2021)&(age_lte30==1) 
gen s1819a35 = 1*(year<2021)&(age_lte35==1)
gen s1819a40 = 1*(year<2021)&(age_lte40==1)
gen s1819a45 = 1*(year<2021)&(age_lte45==1)
gen s1819s1 = 1*(year<2021)&(shock_any==1) if shock_any!=.
gen s1819inc = 1*(year<2021)&(fin_income_q10>5) if fin_income_q10!=.
gen s1819bach = 1*(year<2021)&(bach==1) if bach!=.
/* deal with weird coding of plasma frequency variable*/
tab plas_num yearf if yearf<2021 & plas_any==1
// since the question is predicated on having sold plasma at least once in the last 6 months, change zeros to 1
replace plas_numc = . if plas_any==0
replace plas_numc=1 if plas_any==1 & plas_numc==0
// 2018 survey was categoricial with more than 10 times set to 11 (2019 survey was continuous), must clip at 11 to make the two surveys comparable
* hfsdf['plas_numc'] = np.where(hfsdf['plas_any']==1,hfsdf['plas_num'].clip(lower=0,upper=11),np.nan)
gen plas_gt1 = 1*(plas_numc>1) if plas_any!=.
gen plas_gt2 = 1*(plas_numc>2) if plas_any!=.
gen plas_gt6 = 1*(plas_numc>6) if plas_any!=.
label var plas_gt1 "PC gt1"
label var plas_gt1 "PC gt2"
label var plas_gt6 "PC gt6"
replace plas_numc = . if plas_any==.
gen afs_kite = hard_kite
label var afs_kite "AFS Overdraft"
cap drop rm socdis_safety_mean
egen rm = rowmiss(socdis_mask socdis_inform socdis_wash socdis_gather socdis_distance socdis_home)
egen socdis_safety_mean = rowmean(socdis_mask socdis_inform socdis_wash socdis_gather socdis_distance socdis_home) if rm<2
label var socdis_safety_mean "SocDis Safety"
cap drop rm 
gen fl_lt_q2 = 1*(fl_lt_q3>0) if fl_lt_q3 != .
gen fa_lt_q2 = 1*(fa_lt_q3>0) if fa_lt_q3 != .
gen fl_lt_q2l = 1*(fl_lt_q2==0) if fl_lt_q2 != .
gen fa_lt_q2l = 1*(fa_lt_q2==0) if fa_lt_q2 != .
label var fl_lt_q2l "Fin Liab LT Low"
label var fa_lt_q2l "Fin Asset LT Low"
gen fa_st_q3l = 1*(fa_st_q3==0) if fa_st_q3!=.
gen fa_st_q3m = 1*(fa_st_q3==1) if fa_st_q3!=.
gen fl_st_q3l = 1*(fl_st_q3==0) if fl_st_q3!=.
gen fl_st_q3m = 1*(fl_st_q3==1) if fl_st_q3!=.
label var fl_st_q3l "Fin Liab ST Low"
label var fa_st_q3l "Fin Asset ST Low"
label var fl_st_q3m "Fin Liab ST Mid"
label var fa_st_q3m "Fin Asset ST Mid"
* sum short and long term liabilities together since that break out is overly complicated
egen liab = rowtotal(fl_lt fl_st) // sum short and long term liab together
sort yearf wave
by yearf wave: egen liab_q3 = xtile(liab) if !missing(liab), nq(3)   
g liab_q3_low = (liab_q3==1)
g liab_q3_mid = (liab_q3==2)
g liab_q3_high = (liab_q3==3)
label var liab_q3_low "Low Liabilities"
label var liab_q3_mid "Mid Liabilities"
label var liab_q3_high "High Liabilities"
egen asset = rowtotal(fa_lt fa_st) // sum short and long term liab together
sort yearf wave
by yearf wave: egen asset_q3 = xtile(asset) if !missing(asset), nq(3)   
g asset_q3_low = (asset_q3==1)
g asset_q3_mid = (asset_q3==2)
g asset_q3_high = (asset_q3==3)
label var asset_q3_low "Low Assets"
label var asset_q3_mid "Mid Assets"
label var asset_q3_high "High Assets"
cap drop gig_yesno_m
gen gig_yesno_m = gig_yesno
replace gig_yesno_m = 0 if (year==2021)&(wave>1)&((gig_recent_time !="")|(gig_ever!=.))
replace gig_yesno_m = 1 if (year==2021)&(wave>1)&(gig_recent_time !="")&((gig_recent_time == "3 to 6 months ago")|(gig_recent_time == "6 to 12 months ago")|(gig_recent_time == "Within the last 3 months"))
label var gig_yesno_m "Gig"
label var health_selfcompare "Health Physical"
label var health_selfmental "Health Mental"
gen health_selfcompare_pp = health_selfcompare_6mo
replace health_selfcompare_pp = health_selfcompare_3mo if health_selfcompare_pp==.
gen health_selfmental_pp = health_selfmental_6mo
replace health_selfmental_pp = health_selfmental_3mo if health_selfmental_pp==.
label var health_selfcompare_pp "Health Phys PP (3/13/2020)"
label var health_selfmental_pp  "Health Mental PP (3/13/2020)"
gen health_selfcompare_delt = health_selfcompare - health_selfcompare_pp
gen health_selfmental_delt = health_selfmental - health_selfmental_pp
label var health_selfcompare_delt "Health Phys \(\Delta\)"
label var health_selfmental_delt  "Health Mental \(\Delta\)"
gen health_selfcompare_delt3m = health_selfcompare - health_selfcompare_3m
gen health_selfmental_delt3m = health_selfmental - health_selfmental_3m
label var health_selfcompare_delt3m "Health Phys \(\Delta\) 3M"
label var health_selfmental_delt3m  "Health Mental \(\Delta\) 3M"
replace covid_care = 0 if covid_symptom == 0
replace covid_positive = 0 if covid_test == 0
cap drop health_covidrisk_m health_covidrisk_sp_m 
gen health_covidrisk_m = health_covidrisk
replace health_covidrisk_m = 0 if (health_covidrisk_fam==0)&(year==2021)
gen health_covidrisk_sp_m = health_covidrisk_sp
replace health_covidrisk_sp_m = 0 if (health_covidrisk_fam==0)&(year==2021)
foreach v in pop2sqkm_wq10 fin_income_q10 {
forvalues i=1/9 {
	cap drop `v'_`i'
	gen `v'_`i' = 1*(`v' == `i') if `v' != .
	if "`v'"=="fin_income_q10" {
		label var `v'_`i' "Income Dec`i'"
	}
}
}
*Did you donate in the past year (comparable to gig_yesno for 2018-19)
cap drop gig_yesno_21 
gen gig_yesno_21 = 1*(gig_recent_time!="more than a year ago") if ((year==2021)&(wave>1)&(gig_recent_time!=""))
replace gig_yesno_21 = 0 if ((year==2021)&(wave>1)&(gig_recent_time==""))
tab gig_yesno_21 
cap drop emer_2k_no emer_2k_probnot emer_2k_maybe emer_2k_yes
gen emer_2k_no = 1*(emer_2k==1) if (emer_2k!=.)
gen emer_2k_probnot = 1*(emer_2k==2) if (emer_2k!=.)
gen emer_2k_maybe = 1*(emer_2k==3) if (emer_2k!=.)
gen emer_2k_yes = 1*(emer_2k==4) if (emer_2k!=.)
label var emer_2k_no "Emergency 2K? --No"
label var emer_2k_probnot "Emergency 2K? --Prob Not"
label var emer_2k_maybe "Emergency 2K? --Maybe"
gen plan_per_days = 1*(plan_per_m==1) if plan_per_m!=.
gen plan_per_weeks = 1*(plan_per_m==2) if plan_per_m!=.
gen plan_per_months = 1*(plan_per_m==3) if plan_per_m!=.
label var plan_per_days "Planning Horizon: Next few days"
label var plan_per_weeks "Planning Horizon: Next few weeks"
label var plan_per_months "Planning Horizon: Next few months"
*Winsorize financial income
gen fin_income_f = inc_agi if wave==1
replace fin_income_f = fin_income_m if year==2021
winsor2 fin_income_f, c(0 99) s("_w") by(year wave)
label var fin_income_f_w "Income"
* more cleaning (not relevant for the maps produced by this code)
global samples 	s1819
cap drop samp_2 samp_3
gen samp_2	= "IRSFFA" if (year<2021)&(year>2017)
replace samp_2 = "SEIC" if year==2021
cap drop samp_3
gen samp_3	= "IRSFFA 2018-19" if (year<2021)&(year>2017)
replace samp_3 = "SEIC 5/20" if (year==2021)&(wave==1)
replace samp_3 = "SEIC 8/20 - 5/21" if (year==2021)&(wave>1)
cap drop samp_3g
gen samp_3g	= 1 if (year<2021)&(year>2017)
replace samp_3g = 2 if (year==2021)&(wave==1)
replace samp_3g = 3 if (year==2021)&(wave>1)
g nonwhite = (white==0)
g female = (male==0)
g emp = (emp_ue==0)
gen fin_income_q10c_sq = fin_income_q10^2 if fin_income_q10!=.
gen fin_income_q10c_lte3 = 1*(fin_income_q10<4) if fin_income_q10!=.
gen fin_income_q10c_4to6 = 1*(fin_income_q10<7)*(fin_income_q10>3) if fin_income_q10!=.
label var fin_income_q10c_lte3 "Income Low"
label var fin_income_q10c_4to6 "Income Low-Mid"
gen fa_lt_gt0 = 1*(fa_lt>0) if fa_lt!=.  
gen fl_lt_gt0 = 1*(fl_lt>0) if fl_lt!=.
label var fa_lt_gt0 "Fin Asset LT gt0"
label var fl_lt_gt0 "Fin Liab LT gt0"
replace cred_score_poor = . if cred_score ==.
replace cred_score_mid = . if cred_score ==.
* emer_2k_no emer_2k_probnot emer_2k_maybe
gen emer_2k_no_probnot = ( (emer_2k_no==1) | (emer_2k_probnot==1) )
replace emer_2k_no_probnot=. if (missing(emer_2k_no) & missing(emer_2k_probnot) & missing(emer_2k_maybe) & missing(emer_2k_yes))
label var emer_2k_no_probnot "Handle $2k emergency - no"
*plan_per_days plan_per_weeks plan_per_months 
gen plan_ST = ( (plan_per_days==1) | (plan_per_weeks==1) )
replace plan_ST=. if (missing(plan_per_m))
label var plan_ST "Planning horizon - days or weeks"
*credhab_payfull credhab_carrybal credhab_min credhab_late credhab_miss  
gen cc_del = ( (credhab_late==1) | (credhab_miss==1) )
replace cc_del=. if (missing(credhab_carrybal) & missing(credhab_late) & missing(credhab_min) & missing(credhab_miss) & missing(credhab_payfull))
label var cc_del "Credit card delinquency"
label var resource400_cash "Cash"
label var resource400_paynext "Credit card (next due date)"
label var resource400_paytime "Credit card (pay over time)"
label var resource400_loan "Bank loan"
label var resource400_afs "Non-bank loan"
label var resource400_fam "Family"
label var resource400_sell "Sell something"
label var resource400_other "Other"
label var resource400_none "Cannot meet"
gen resource400_cc = (resource400_paynext==1 | resource400_paytime==1)
replace resource400_cc = . if (missing(resource400_paynext) & missing(resource400_paytime))
label var resource400_cc "Credit Card"
* education dummies
gen somecol = (educ==12)
replace somecol = . if missing(educ)
label var somecol "Some College"
gen highschool = (educ==8)
replace highschool = . if missing(educ)
label var highschool "Associate's Degree"
gen associates = (educ==0)
replace associates = . if missing(educ)
label var associates "High School Degree"
* income dummies (since income deciles are not directly interpretable in IRSFFA, which is focused on LMI)
gen income_lt10k = (fin_income_m2<10000) if !missing(fin_income_m2)
gen income_10_20k = (fin_income_m2<20000 & fin_income_m2>=10000) if !missing(fin_income_m2)
gen income_20_30k = (fin_income_m2<30000 & fin_income_m2>=20000) if !missing(fin_income_m2)
gen income_30_40k = (fin_income_m2<40000 & fin_income_m2>=30000) if !missing(fin_income_m2)
gen income_40_50k = (fin_income_m2<50000 & fin_income_m2>=40000) if !missing(fin_income_m2)
gen income_50_60k = (fin_income_m2<60000 & fin_income_m2>=50000) if !missing(fin_income_m2)
gen income_60_70k = (fin_income_m2<70000 & fin_income_m2>=60000) if !missing(fin_income_m2)
gen income_70_80k = (fin_income_m2<80000 & fin_income_m2>=70000) if !missing(fin_income_m2)
gen income_80_90k = (fin_income_m2<90000 & fin_income_m2>=80000) if !missing(fin_income_m2)
gen income_90_100k = (fin_income_m2<100000 & fin_income_m2>=90000) if !missing(fin_income_m2)
label var income_lt10k "Income <$10k"
label var income_10_20k "Income [$10k-20k)"
label var income_20_30k "Income [$20k-30k)"
label var income_30_40k "Income [$30k-40k)"
label var income_40_50k "Income [$40k-50k)"
label var income_50_60k "Income [$50k-60k)"
label var income_60_70k "Income [$60k-70k)"
label var income_70_80k "Income [$70k-80k)"
label var income_80_90k "Income [$80k-90k)"
label var income_90_100k "Income [$90k-100k)"
gen income_lt30k = (fin_income_m2<30000) if !missing(fin_income_m2)
gen income_30_60k = (fin_income_m2<60000 & fin_income_m2>=30000) if !missing(fin_income_m2)
gen income_60_90k = (fin_income_m2<90000 & fin_income_m2>=60000) if !missing(fin_income_m2)
gen income_gt90k = (fin_income_m2>=90000) if !missing(fin_income_m2)
label var income_lt30k "Income <$30k"
label var income_30_60k "Income [$30k-60k)"
label var income_60_90k "Income [$60k-90k)"
label var income_gt90k "Income >=$90k"
gen income_lt20k = (fin_income_m2<20000) if !missing(fin_income_m2)
gen income_20_40k = (fin_income_m2<40000 & fin_income_m2>=20000) if !missing(fin_income_m2)
gen income_gt40k = (fin_income_m2>=40000) if !missing(fin_income_m2)
label var income_lt20k "Income <$20k"
label var income_20_40k "Income [$20k-40k)"
label var income_gt40k "Income >$40k"

save "${path_md}/representative test.dta" , replace


/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
GENERATE MAPS COMPARING THE GEOGRAPHIC CONCENTRATION OF AFS BORROWERS IN SURVEY DATA TO THAT OF CLARITY DATA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

/*-------------------------------------------
Prep survey data for map (HFS & SEIC)
-------------------------------------------*/
eststo clear
use "${path_md}/representative test.dta" , clear

*MUST CHANGE DEPENDING ON GRAPH BEING CREATED (SEIC OR HFS)
* restrict to SEIC 														
keep if (samp_3g==2 | samp_3g==3) // "SEIC 5/20 - 5/21"  
* restrict to HFS 
*keep if samp_3g==1 // "IRSFFA 2018-19"

drop afs_all_any afs_all_num
egen afs_all_num = rowtotal(afs_auto afs_check afs_lns_any afs_order afs_pawn afs_payday afs_payroll afs_refund afs_rent afs_svc_any afs_wire)
g afs_all_any = (afs_all_num>0)

* relevant vars (in Clarity and survey): 
keep index afs_all_any  zip state_abbrev age fin_income
rename   state_abbrev  state
tab afs_all_any // 17%

* first group by state
collapse (sum) num_afs_borrowers = afs_all_any  (count) num_people = afs_all_any  , by(state) 
keep if !missing(state)
g pct_afs_borrowers = num_afs_borrowers * 100 / num_people
replace pct_afs_borrowers = round(pct_afs_borrowers)

* state share of all afs borrowers in sample (NOT USED AS DEPVAR BECAUSE A HIGH PROPORTION JUST REPRESENTS LARGE POPULATION STATES)
egen sum_num_afs_borrowers = sum(num_afs_borrowers) // all afs borrowers across all states summed
g state_pct_all_afs_borrowers = num_afs_borrowers * 100 / sum_num_afs_borrowers
replace state_pct_all_afs_borrowers = round(state_pct_all_afs_borrowers, 0.1)

* statastates is a simple Stata module for adding U.S. state identifiers (abbreviation, FIPS code, and name) that may be missing from your dataset.
* https://github.com/wschpero/statastates
*ssc install statastates, replace
*net install statastates, from(https://raw.github.com/wschpero/statastates/master/) replace
*For example, if you have the state abbreviations in your dataset under the variable "state" and want to merge state FIPS codes and names using that variable, enter the following:
statastates, abbreviation(state)
drop _merge
drop if missing(state_fips)
*drop duplicates
sort state_fips
quietly by state_fips: gen dup = cond(_N == 1,0,_n)
drop if dup > 1 /* 0 observations deleted) */
drop dup
save "${path_md}/ready to mapt.dta" , replace

/*-------------------------------------------
map surveys (HFS & SEIC)
-------------------------------------------*/
*ssc install shp2dta
*ssc install spmap
* ssc install maptile
*https://michaelstepner.com/maptile/ 
* good examples: help maptile

* import state geography file
maptile_install using "http://files.michaelstepner.com/geo_state.zip"
// Geographic ID variable:
// state (containing two-letter state abbreviations)
// statefips (containing two-digit FIPS codes)
// statename (containing unabbreviated state names)

* import survey map data
use "${path_md}/ready to mapt.dta", clear
*su pct_afs_borrowers // SEIC min 6 max 31  ;  HFS min 7 max 30
rename state_fips statefips
rename state_name statename // must rename to match the geography file
* dependent variable for maps
global depvar pct_afs_borrowers //   state_pct_all_afs_borrowers       !!!

* MAPS ---
* Basic
maptile $depvar , geo(state)
* Try coloring each bin proportionally to its median value.
maptile $depvar , geo(state) propcolor twopt(legend(title("% of Sample"))) 
matrix list r(midpoints)
*  now try coloring states individually and displaying a full spectrum in the legend.
maptile $depvar , geo(state) spopt(legstyle(3)) cutvalues(5.0(6)31.0) fcolor(Reds)
su pct_afs_borrowers
*   Try the "Reds" color scheme  built into spmap.
maptile $depvar , geo(state)  fcolor(Reds)  
*  Let's add a legend title.
maptile $depvar , geo(state) nq(4) fcolor(Reds)  legd(0) twopt(legend(title("%"))) // USED!

/*-------------------------------------------
Prep Clarity data for map
-------------------------------------------*/
/*We have a nationwide random sample of 2.5 million and 0.5 million individuals, respectively.
These data were captured during Q1 2014 through Q4 2020.  We have a random (~4%) sample of Clarity's afs borrowers sample.
This means (a) states with a larger Clarity sample could just be larger population states (i.e., we need a denominator that 
reflects the population); and (b) we cannot blow up our sample to make it reflective of the entire Clarity database
because we don't know what share (x) of that database we see. Thus, our afs share of population numbers will necessarily
be small since we do not even see the entire Clarity database, just an x% random sample of it. Still, we can shed light
on whether the same states that have a large share of survey respondents who are AFS borrowers, also tend to have a 
larger Clarity sample as a portion of the state's population. 
*/

* import inquiries files
* add in the 2020 data
import delim using "$path_clarity/Inquiries_Return_to_Colorado_20210901.csv", stringcols(1 2 4 5 7 8 9) clear
append using "$path_clarity/Inquiries.dta"
* fix dates
generate numdate = date(inquiry_received_date, "YMD")
format numdate %td
g year = year(numdate)
tab year
* how many unique id's appear in the file:
sort  colorado_identity_id numdate 
cap drop nvals
by colorado_identity_id, sort: gen nvals = _n == 1 
count if nvals // 2,592,099
drop nvals
save "$path_clarity/Clarity_2014_2020.dta" , replace

* collapse to an individual by state by year file
use "$path_clarity/Clarity_2014_2020.dta", clear
sort  colorado_identity_id state numdate 
* keep only the first time a colorado_identity_id shows up in a state year combination (1 obs per unique person in a state year)
gen first_id = 0
bysort colorado_identity_id state year: replace first_id = 1 if _n == 1
keep if first_id==1
* collapse to a state by year file
collapse (sum) num_clarity_borrowers = first_id , by(state year) 
drop if missing(state)
* add state identifiers
*statastates is a simple Stata module for adding U.S. state identifiers (abbreviation, FIPS code, and name) that may be missing from your dataset.
* https://github.com/wschpero/statastates
*ssc install statastates, replace
*net install statastates, from(https://raw.github.com/wschpero/statastates/master/) replace
*For example, if you have the state abbreviations in your dataset under the variable "state" and want to merge state FIPS codes and names using that variable, enter the following:
statastates, abbreviation(state)
drop _merge
drop if missing(state_fips)
save "$path_clarity/Clarity_2014_2020_num_id_by_stateyear.dta" , replace

* GET ACS and merge in state-year adult population (denominator)
*ssc install getcensus
* help getcensus
*replace XYZ w/ your key
global censuskey 4825041b9d4c68a53c196eac995ede03fca13420
* find census tables: https://www.census.gov/programs-surveys/acs/technical-documentation/table-shells.2020.html#list-tab-79594641
*get populations and include as a column in collapsed clarity data  
getcensus B01003, year(2014-2020) sample(5) geography(state) noerror clear // B01003 is the "Total Population" table (only 1 variable exists in table)
destring state, generate(state_fips)
g state_name = strupper(name)
drop geo_id name state
save "$path_clarity/ACS_pop_2014_2020.dta" , replace
getcensus B09001_001, year(2014-2020) sample(5) geography(state) noerror clear // B09001	POPULATION UNDER 18 YEARS BY AGE (_001 is the first variable in table)
destring state, generate(state_fips)
g state_name = strupper(name)
drop name state
save "$path_clarity/ACS_adultpop_2014_2020.dta" , replace
* merge ACS into collapsed clarity
use "$path_clarity/Clarity_2014_2020_num_id_by_stateyear.dta", clear
sort year state_fips
merge 1:1  year state_fips using "$path_clarity/ACS_pop_2014_2020"
drop if missing(state) // just puerto rico unmatched
drop _merge 
merge 1:1  year state_fips using "$path_clarity/ACS_adultpop_2014_2020"
drop if missing(state) // just puerto rico unmatched
drop _merge 
* calculate adult pop as total pop - pop under 18 
g adult_pop = b01003_001e - b09001_001e
* calculate clarity borrower share of adult pop by state year
g clarity_share_of_pop = num_clarity_borrowers *100 / adult_pop
* average by state across years
collapse (mean) avg_clarity_share_of_pop = clarity_share_of_pop , by(state state_fips state_name ) 
* Clarity reports having 63 million consumers at the time of our pull (2020), but we have a sample of 2,592,099 in the inquiries file,
*	implying that we have a 4.1% random sample. 
*   implying a multiple of 24.30 =  63,000,000 / 2,592,099 
g expand_avg_clarity_share_of_pop = avg_clarity_share_of_pop * 24.30
replace expand_avg_clarity_share_of_pop = round(expand_avg_clarity_share_of_pop)
replace avg_clarity_share_of_pop = round(avg_clarity_share_of_pop, 0.01)
* save for mapping
save "$path_clarity/Clarity_share_of_state_pop.dta" , replace

/*-------------------------------------------
map clarity
-------------------------------------------*/
* import state geography file
maptile_install using "http://files.michaelstepner.com/geo_state.zip"
// Geographic ID variable:
// state (containing two-letter state abbreviations)
// statefips (containing two-digit FIPS codes)
// statename (containing unabbreviated state names)
* import survey map data
use "$path_clarity/Clarity_share_of_state_pop.dta", clear
*su pct_afs_borrowers // SEIC min 6 max 31  ;  HFS min 7 max 30
rename state_fips statefips
rename state_name statename // must rename to match the geography file
* dependent variable for maps
global depvar expand_avg_clarity_share_of_pop // avg_clarity_share_of_pop expand_avg_clarity_share_of_pop       !!!
* MAPS ---
maptile $depvar , geo(state) nq(4) fcolor(Reds)  legd(0) twopt(legend(title("%")))

